---
title: "Balancing algorithm report"
output:
  html_document:
    df_print: paged
editor_options:
  chunk_output_type: console
---

```{r,message=FALSE,echo=FALSE}
library(dplyr)
```

```{r,echo=FALSE,message=FALSE,warning=FALSE}
# Respondent-level data to be weighted.
df <- read.csv("romania_clean.csv")

# Target margins: one row per (variable, label) with `est` treated as a
# percentage. It is converted to an expected count at the sample size and
# stored in `Freq`, the column name used for the population margins below.
est <- read.csv("Romaniaestimates.csv")
est$est <- est$est / 100
est$Freq <- nrow(df) * est$est
est$est <- NULL

# One margin table per balancing variable, with the `variable` column dropped.
X <- split(est, est$variable)
X <- lapply(X, function(x) {
  x["variable"] <- NULL
  x
})

# Publish each margin table as a global named after its variable
# (joint, pol, sex, vot are used right below).
list2env(X, .GlobalEnv)

# Rename `label` to the variable's own name so every margin table carries the
# same column name as the corresponding column referenced in sample.margins.
joint.d <- rename(joint, joint = label)
pol.d <- rename(pol, pol = label)
sex.d <- rename(sex, sex = label)
vot.d <- rename(vot, vot = label)

library(survey)
# seq_len() instead of seq(1:nrow(df)): the latter yields c(1, 2) for an
# empty data frame.
df$ids <- seq_len(nrow(df))
df1 <- svydesign(ids = ~1, data = df)

# Rake to the four target margins.
df.r <- rake(
  design = df1,
  sample.margins = list(~joint, ~pol, ~sex, ~vot),
  population.margins = list(joint.d, pol.d, sex.d, vot.d)
)

# Bound the raked weights to [0.6, 5].
df.r1 <- trimWeights(df.r, lower = 0.6, upper = 5, strict = TRUE)
df$w <- weights(df.r1)
# Integer replication count per respondent.
df$Freq <- round(df$w, 0)

# Expand the data: repeat every row `Freq` times and drop the helper column.
# An integer row index replaces the original character row-name lookup, and
# columns are selected by name rather than by position 1:5.
df0 <- select(df, Record.ID, joint, pol, sex, vot, Freq)
MyData <- df0[rep(seq_len(nrow(df0)), df0$Freq), setdiff(names(df0), "Freq")]
write.csv(MyData, "MyData.csv", row.names = FALSE)

```

# Weighting report
```{r}
# Distribution of the per-respondent replication counts
table(df[["Freq"]])

# Number of rows added by replicating observations
nrow(MyData) - nrow(df)
```


# Python (config = code) 
```{python engine.path="/usr/bin/python3.5",results='hide'}
import sys

# The balancing module lives outside the default import path, so its directory
# must be registered before the import below.
sys.path.append('/home/shared/data/balancing/')
import drop

# Per-variable target values keyed by column position (1-4).
# NOTE(review): presumably target percentages per category — confirm against
# the estimates file and the drop module.
config = "{1: {'degreeHigh':4.1,'noDegreeHigh':45.8,'degreeLow':5.6,'noDegreeLow':44.4}},{2: {'interest':51.1,'LowInterest':48.9}},{3: {'Male':49.9,'Female':50.1}},{4: {'USD':38.89,'PNL':15.52,'PDL':12.65,'UDMR':6.51,'PMP':6.43,'Undec':20.00}}"

# Input: the weight-expanded data set.
filename = '/home/fernando/Misc/bjpolFinal/Romania/MyData.csv'

# Output location passed to drop.run.
# NOTE(review): the R code later reads "myBal.csv", so drop.run presumably
# appends the extension — confirm.
output_base = "/home/fernando/Misc/bjpolFinal/Romania/myBal"

# NOTE(review): the meaning of the last argument (2.5) is defined by drop.run
# and is not visible here — confirm.
drop.run(config, filename, output_base, 2.5)
```

# Results
```{r,echo=FALSE}
# MyData <- read.csv("MyData.csv")
# The balanced file is read without a header row, so it takes its column
# names from the weight-expanded data.
cols <- names(MyData)
myBal <- read.csv(file = "myBal.csv", header = FALSE)
colnames(myBal) <- cols

```


### Proportion of dataset retained
```{r}

# N of original data
nrow(df)

# N of weighted (replicated) data
nrow(MyData)

# N of balanced data
N <- nrow(myBal)
N

# N of unique respondents in the balanced data
uID <- length(unique(myBal$Record.ID))
uID

# Percent unique of total
# (scaled by 100 so it really is a percentage, like the figure below)
100 * (uID / N)

# Percent of weighted data frame maintained
100 * (N / nrow(MyData))
```

## Summary

```{r}
# Per-level percentages / counts of every balancing variable (all columns
# except the first, Record.ID). lapply() always returns a list, so unlist()
# yields a vector named "<variable>.<level>" even when every variable has the
# same number of levels (sapply() would simplify to a matrix in that case).
level_pct <- function(data) {
  unlist(lapply(data[-1], function(x) round(100 * prop.table(table(x)), 1)))
}
level_n <- function(data) {
  unlist(lapply(data[-1], function(x) table(x)))
}

# Reorder `values` to follow `keys`; levels with no observations get 0.
align_to_keys <- function(values, keys) {
  out <- stats::setNames(numeric(length(keys)), keys)
  found <- intersect(keys, names(values))
  out[found] <- values[found]
  out
}

# Distribution of the weight-expanded data (MyData), in % and as counts
origProp <- level_pct(MyData)
origN <- level_n(MyData)

# Distribution after balancing (%)
balProp <- level_pct(myBal)

# DESIRED data distribution (%)
estimates <- read.csv("Romaniaestimates.csv")

estimates <- data.frame(
  estimates %>%
    group_by(variable) %>%
    arrange(label, .by_group = TRUE)
)

# Match observed levels to the estimates by name instead of by position, so
# the table stays correct when table() and arrange() sort labels differently
# or when a level is missing from the balanced data.
keys <- paste(estimates$variable, estimates$label, sep = ".")
unmatched <- setdiff(names(origN), keys)
if (length(unmatched) > 0) {
  warning(
    "Levels present in the data but not in the estimates: ",
    paste(unmatched, collapse = ", "),
    call. = FALSE
  )
}
balProp <- align_to_keys(balProp, keys)
origProp <- align_to_keys(origProp, keys)
origN <- align_to_keys(origN, keys)

summ <- data.frame(cbind(estimates, balProp, origProp, origN))
summ$gap <- with(summ, round(est / origProp, 2))
# Summary table provides:
# Desired proportions: est
# Proportions after balancing: balProp
# Proportions in the weight-expanded data (MyData): origProp
# Frequencies in the weight-expanded data (MyData): origN
# Gap: est / origProp; values far from 1 indicate a big gap
```


```{r,echo=FALSE}
# Root mean squared error of a numeric vector of errors:
# the square root of the average squared error.
rmse <- function(error) {
  squared <- error^2
  sqrt(mean(squared))
}
 
# Mean absolute error of a numeric vector of errors:
# the average of the error magnitudes.
mae <- function(error) {
  magnitude <- abs(error)
  mean(magnitude)
}
 
# Per-level deviation of the balanced share from the desired share
error <- summ[["est"]] - summ[["balProp"]]
```

### RMSE and MAE
```{r}
# Root mean squared error between desired and balanced shares
rmse(error = error)

# Mean absolute error between desired and balanced shares
mae(error = error)
```

### Comparison across balancing variables
```{r,message=FALSE,echo=FALSE,warning=FALSE}
library(knitr)
library(kableExtra)

# Render the comparison table without row names
summ_table <- kable(summ, row.names = FALSE)
kable_styling(summ_table)
```


```{r,echo=FALSE}
## Save balanced file
# Inner join of the balanced rows with the full respondent data on Record.ID,
# keeping merge()'s unsorted row order.
myBal2 <- merge(x = myBal, y = df, by = "Record.ID", all = FALSE, sort = FALSE)
write.csv(myBal2, file = "myBal2.csv", row.names = FALSE)


```

### Final table of weighted observations retained
```{r,echo=FALSE}
# Replication counts among the rows retained after balancing
table(myBal2[["Freq"]])
```